The following graph study will focus on a dataset of newly sold cars in the United Kingdom from 2000 to 2013.
The emphasis will be on CO2 emissions from those cars and other related variables.
But first, another dataset is used to give an overview of worldwide emissions in the transportation sector.
As GDP per capita increases, how do the CO2 emissions produced by the transportation sector change? Furthermore, what if we factor in the population?
# Bubble chart: transport share of CO2 emissions against GDP per capita on a
# log-scaled x axis. Point size follows total population, colour the continent.
ggplot(
  data_worldb,
  aes(
    x = gdp_per_capita,
    y = co2_fromtransport_percent_total,
    size = pop_total,
    colour = continent
  )
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_size(range = c(2, 10)) +
  scale_color_manual(
    values = c("#003f5c", "#665191", "#d45087", "#ff7c43", "#ffa600", "#1ce3cd")
  ) +
  labs(
    x = "GDP per Capita (log scale)",
    y = "CO2 emissions from transportation (%)"
  ) +
  theme(
    legend.position = "right",
    axis.text = element_text(size = 6),
    axis.title = element_text(size = 8)
  )
How have the CO2 emissions from the sector changed over the years?
# Animated line chart: transport share of CO2 emissions over the years, one
# line (plus points) per country, coloured by continent. transition_reveal()
# reveals the series progressively along `year`.
ggplot(
  data_worldb,
  aes(
    x = year,
    y = co2_fromtransport_percent_total,
    group = country,
    # Map the column directly instead of factor(continent) so the legend title
    # reads "continent", matching the other continent-coloured charts.
    colour = continent
  )
) +
  geom_line() +
  geom_point() +
  scale_color_manual(
    values = c("#003f5c", "#665191", "#d45087", "#ff7c43", "#ffa600", "#1ce3cd")
  ) +
  labs(x = "year", y = "CO2 emissions from transportation (%)") +
  theme(
    legend.position = "right",
    axis.text = element_text(size = 6),
    axis.title = element_text(size = 8),
    # The x-axis title is hidden on purpose; only the y title is displayed.
    axis.title.x = element_blank()
  ) +
  transition_reveal(year)
A closer look by continent:
# Small multiples: one panel per continent, one thin line per country.
# The legend is suppressed because the panel strips already name the continent.
ggplot(
  data_worldb,
  aes(
    x = year,
    y = co2_fromtransport_percent_total,
    group = country,
    color = continent
  )
) +
  geom_line(lwd = 0.4, show.legend = FALSE) +
  # Disabled draft of a LOESS trend overlay, kept for reference:
  # geom_smooth(data=data_worldb, aes(year, co2_fromtransport_percent_total, group = 1), lwd = 2, method = 'loess', span = 2, se = TRUE, color = decoration_color) +
  facet_wrap(~ continent, ncol = 3, strip.position = "bottom") +
  scale_color_manual(
    values = c("#003f5c", "#665191", "#d45087", "#ff7c43", "#ffa600", "#1ce3cd")
  ) +
  labs(x = "Year", y = "CO2 emissions from transportation (%)") +
  theme(
    axis.text = element_text(size = 6),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 9),
    # Both axis titles are hidden in the rendered figure.
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
What about Europe alone?
# Europe only: one line per European country, all drawn in a single colour.
ggplot(
  subset(data_worldb, continent == "Europe"),
  aes(x = year, y = co2_fromtransport_percent_total, group = country)
) +
  geom_line(color = "#d45087", show.legend = FALSE) +
  labs(x = "Year", y = "CO2 emissions from transport (%)") +
  theme(
    axis.text = element_text(size = 6),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 9),
    # Both axis titles are hidden in the rendered figure.
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
A closer look at each country within Europe:
# Europe, split by country: one panel per country, four panels per row.
ggplot(
  subset(data_worldb, continent == "Europe"),
  aes(x = year, y = co2_fromtransport_percent_total)
) +
  geom_line(color = "#d45087") +
  facet_wrap(~ country, ncol = 4) +
  labs(x = "Year", y = "CO2 from transport (%)") +
  theme(
    axis.text = element_text(size = 6),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 9),
    # Both axis titles are hidden in the rendered figure.
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
A focus on the United Kingdom that will be further analyzed in the second part:
# United Kingdom only: a single, slightly thicker line.
ggplot(
  subset(data_worldb, country == "United Kingdom"),
  aes(x = year, y = co2_fromtransport_percent_total)
) +
  geom_line(color = "#d45087", lwd = 0.8) +
  labs(x = "Year", y = "CO2 from transport (%) for the United Kingdom") +
  theme(
    axis.text = element_text(size = 6),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 9),
    # Both axis titles are hidden in the rendered figure.
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )
With a general overview on the increasing trend of CO2 emissions from the transportation sector, we can now turn to analyzing the main dataset. This dataset is about the new cars sold in the United Kingdom between 2000 and 2013.
First looking at the distribution of the most important categorical variables:
1 - Distribution of Car Brand manufacturer in the dataset
# Horizontal bar chart of the number of cars per manufacturer, with the bars
# ordered by count.
ggplot(
  count(data, manufacturer),
  aes(x = reorder(manufacturer, n, sum), y = n)
) +
  geom_col(fill = main2_color) +
  coord_flip() +
  labs(x = "Brands", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.x = element_blank()
  )
2 - Distribution of years, transmission types, fuel types, and Eurostandards
# 2 - Distribution of years in the dataset
# Horizontal bars of the number of cars per year, ordered by count.
years_grid <- ggplot(
  count(data, year),
  aes(x = reorder(year, n, sum), y = n)
) +
  geom_col(fill = main2_color) +
  coord_flip() +
  labs(x = "Year", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# Tabular check (disabled): data %>% count(year, sort=T)
# 3 - Distribution of car transmission type
# Horizontal bars of the number of cars per transmission type, ordered by count.
trans_type_grid <- ggplot(
  count(data, transmission_type),
  aes(x = reorder(transmission_type, n, sum), y = n)
) +
  geom_col(fill = main2_color) +
  coord_flip() +
  labs(x = "Types of Transmission", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# Tabular check (disabled): data %>% count(transmission_type, sort=T)
# 4 - Distribution of car fuel type
# Horizontal bars of the number of cars per fuel type, ordered by count.
fueltype_grid <- ggplot(
  count(data, fuel_type),
  aes(x = reorder(fuel_type, n, sum), y = n)
) +
  geom_col(fill = main2_color) +
  coord_flip() +
  labs(x = "Types of Fuel", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# Tabular check (disabled): data %>% count(fuel_type, sort=T)
# 5 - Distribution of the car euro standards
# Horizontal bars of the number of cars per euro standard, ordered by count.
euro_grid <- ggplot(
  count(data, euro_standard),
  aes(x = reorder(euro_standard, n, sum), y = n)
) +
  geom_col(fill = main2_color) +
  coord_flip() +
  labs(x = "Category of Eurostandards", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# Lay out the four categorical distribution charts on one page.
grid.arrange(
  grobs = list(euro_grid, fueltype_grid, trans_type_grid, years_grid)
)
Then looking at the distribution of continuous variables:
3 - Distribution of the cars’ engine capacity, noise levels, and types of emissions
# 1 - Distribution of the cars' engine capacity
# Histogram of engine capacity using bins of width 700.
engine_grid <- ggplot(data, aes(x = engine_capacity)) +
  geom_histogram(binwidth = 700, fill = main2_color) +
  labs(x = "Engine capacity", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# Tabular check (disabled): data %>% count(cut_width(engine_capacity, 500))
# 2 - Distribution of the cars' noise level
# Histogram of noise level (bin width 2) with the x axis limited to 60-90.
noiselevel_grid <- ggplot(data, aes(x = noise_level)) +
  geom_histogram(binwidth = 2, fill = main2_color) +
  xlim(60, 90) +
  labs(x = "Noise level", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# Why the limits: the unrestricted histogram (disabled below) appeared to show
# outliers, so limits were added to see the data better.
# ggplot(data = data) + geom_histogram(mapping = aes(x = noise_level), binwidth = 2)
# data %>% count(cut_width(noise_level, 2))
# Author's observation from that table: only 4 values lie largely below 63.
# 3 - Distribution of the different types of emissions
# CO2: histogram with bins of width 20. The y-axis title is kept on this panel.
co2_ht <- ggplot(data, aes(x = co2)) +
  geom_histogram(binwidth = 20, fill = main2_color) +
  labs(x = "CO2 emissions", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )
# CO emissions: histogram with bins of width 50, x axis limited to 0-2125.
co_emissions_ht <- ggplot(data, aes(x = co_emissions)) +
  geom_histogram(binwidth = 50, fill = main2_color) +
  xlim(0, 2125) +
  labs(x = "CO emissions", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# NOx emissions: histogram with bins of width 50, x axis limited to 0-1025.
nox_emissions_ht <- ggplot(data, aes(x = nox_emissions)) +
  geom_histogram(binwidth = 50, fill = main2_color) +
  xlim(0, 1025) +
  labs(x = "NOX emissions", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# THC + NOx emissions: histogram with bins of width 30.
thc_nox_emissions_ht <- ggplot(data, aes(x = thc_nox_emissions)) +
  geom_histogram(binwidth = 30, fill = main2_color) +
  labs(x = "THC NOX emissions", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.y = element_blank()
  )
# THC emissions: histogram with bins of width 30, x axis limited to 0-300.
# The y-axis title is kept on this panel.
thc_emissions_ht <- ggplot(data, aes(x = thc_emissions)) +
  geom_histogram(binwidth = 30, fill = main2_color) +
  xlim(0, 300) +
  labs(x = "THC emissions", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )
# Lay out the seven continuous-variable histograms on two rows.
grid.arrange(
  grobs = list(
    co2_ht,
    co_emissions_ht,
    nox_emissions_ht,
    thc_nox_emissions_ht,
    thc_emissions_ht,
    engine_grid,
    noiselevel_grid
  ),
  nrow = 2
)
With a proper overview of the distributions of the data, we can now turn to analyzing the variables to detect trends.
What can we detect from engine capacity and fuel types?
# Frequency polygons of engine capacity, one line per fuel type. The legend is
# placed inside the plotting area at relative position (0.815, 0.6).
ggplot(data, aes(x = engine_capacity, colour = fuel_type)) +
  geom_freqpoly() +
  labs(x = "Engine Capacity", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    legend.position = c(0.815, 0.6)
  )
Again, the dominance of two fuel types (Diesel and Petrol) is apparent.
Focusing on these main types could be more interesting:
# Engine capacity for the filtered fuel types: a faint histogram drawn in a
# fixed decoration colour, with one frequency polygon per fuel type on top.
ggplot(
  data_filtered_fuel_type,
  aes(x = engine_capacity, colour = fuel_type)
) +
  # Disabled alternative palette: scale_colour_brewer(palette = "Paired") +
  geom_histogram(
    colour = decoration_color,
    fill = decoration_color,
    alpha = 0.2,
    size = 0
  ) +
  geom_freqpoly() +
  labs(x = "Engine Capacity", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )
Zooming in to look at engine capacities above 2500:
# Zooming in: same chart as the filtered fuel-type view, with the x axis
# limited to engine capacities between 2500 and 5000.
ggplot(
  data_filtered_fuel_type,
  aes(x = engine_capacity, colour = fuel_type)
) +
  # Disabled alternative palette: scale_colour_brewer(palette = "Paired") +
  geom_histogram(
    colour = decoration_color,
    fill = decoration_color,
    alpha = 0.2,
    size = 0
  ) +
  geom_freqpoly() +
  xlim(2500, 5000) +
  labs(x = "Engine Capacity", y = "Number of cars") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )
As soon as the engine capacity reaches a value of 3500, Petrol clearly stands out.
Car engine categorization was set up by computing the minimum, maximum, and average capacity of each car brand. After obtaining these three metrics, the average was plotted and gave a normal distribution; the category boundaries were then set as follows: Low Engine: 599–1368; Medium Engine: 1369–2004; High Engine: ≥ 2005.
Is engine capacity related to noise levels emitted?
# Scatter of CO2 emissions against noise level, one panel per `Type`.
# Points are small and highly transparent so dense regions stay readable.
ggplot(data_classif, aes(x = noise_level, y = co2)) +
  geom_point(color = main2_color, size = 0.5, alpha = 0.09) +
  facet_wrap(~ Type, ncol = 3) +
  labs(x = "Noise level", y = "CO2 emissions") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )
Now what if we select one brand from each engine category and compare all types of emissions?
# Keep the rows of three brands for the comparison.
# Rows are selected with an explicit `[rows, ]` index and %in%: this works for
# both data.frame and data.table inputs and never produces NA rows, whereas
# the comma-less `data[cond]` form only filters rows on a data.table.
data_3_engine_capacity <- data[
  data$manufacturer %in% c("Smart", "Kia", "Ferrari"),
]

# Parallel-coordinates plot of the selected columns, standardised
# (scale = "std"), one panel per manufacturer.
# NOTE(review): columns and groupColumn are positional indices; presumably
# 18, 19, 20 and 22 are the emission measures and 3 is the manufacturer.
# Confirm against the column order of `data`.
ggparcoord(
  data_3_engine_capacity,
  columns = c(18, 19, 20, 22),
  groupColumn = 3,
  scale = "std",
  alphaLines = 0.3
) +
  scale_color_manual(values = c("#2f4b7c", "#d45087", "#ffa600")) +
  ylim(-2.5, 5) +
  facet_wrap(. ~ manufacturer, ncol = 3) +
  labs(x = "", y = "Standardized Value") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    legend.position = "right"
  )
The relationship between CO2 emissions, engine capacity, and fuel type:
# CO2 emissions against engine capacity for the filtered fuel types: faint
# lines and points per fuel type with a smoothed trend on top. The legend is
# placed inside the plotting area at relative position (0.815, 0.2).
ggplot(
  data_filtered_fuel_type,
  aes(
    x = engine_capacity,
    y = co2,
    group = fuel_type,
    shape = fuel_type,
    color = fuel_type
  )
) +
  geom_line(size = 0.5, alpha = 0.1) +
  geom_point(size = 1, fill = "white", alpha = 0.1) +
  stat_smooth() +
  scale_shape_manual(values = c(22, 21)) +
  labs(x = "Engine capacity", y = "CO2 emissions") +
  theme(
    legend.position = c(0.815, 0.2),
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )
The relationship between CO2 emissions and transmission types:
# Keep only the Manual and Automatic transmission types.
# Rows are selected with an explicit `[rows, ]` index and %in%: this works for
# both data.frame and data.table inputs and never produces NA rows, whereas
# the comma-less `x[cond]` form only filters rows on a data.table.
data_filtered_fuel_and_transmission_type <- data_filtered_fuel_type[
  data_filtered_fuel_type$transmission_type %in% c("Manual", "Automatic"),
]

# Ridgeline densities of CO2 emissions, one ridge per transmission type.
# The legend is hidden because the y axis already names each ridge.
ggplot(
  data_filtered_fuel_and_transmission_type,
  aes(x = co2, y = transmission_type, fill = transmission_type)
) +
  geom_density_ridges(color = fill_color, scale = 40, size = 0.1, alpha = 0.7) +
  scale_fill_manual(values = c("#ffa600", "#2f4b7c")) +
  labs(x = "CO2 emissions", y = "") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    legend.position = "none"
  )
Which car brands produce the most powerful engines on average?
# Tufte-style boxplots of engine capacity per manufacturer, with manufacturers
# ordered by their mean engine capacity and drawn horizontally.
data_filtered_fuel_type %>%
  # as_tibble() replaces the deprecated as.tibble() alias.
  as_tibble() %>%
  ggplot(aes(reorder(manufacturer, engine_capacity, mean), engine_capacity)) +
  geom_tufteboxplot(outlier.colour = "transparent", color = main2_color) +
  # Empty text label: nothing visible is drawn.
  # NOTE(review): it may still influence the axis ranges; confirm before
  # removing it.
  annotate("text", x = 10, y = 120, adj = 1, family = "serif", label = c("")) +
  coord_flip() +
  labs(x = "Car Brands", y = "Engine Capacity") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.x = element_blank()
  )
The brands producing the most powerful engines seem to be the luxurious brands.
Which car brands produce the most CO2 emissions?
# Tufte-style boxplots of CO2 emissions per manufacturer, with manufacturers
# ordered by their maximum CO2 value and drawn horizontally.
data_filtered_fuel_type %>%
  # as_tibble() replaces the deprecated as.tibble() alias.
  as_tibble() %>%
  ggplot(aes(reorder(manufacturer, co2, max), co2)) +
  geom_tufteboxplot(outlier.colour = "transparent", color = main2_color) +
  # Empty text label: nothing visible is drawn.
  # NOTE(review): it may still influence the axis ranges; confirm before
  # removing it.
  annotate("text", x = 10, y = 120, adj = 1, family = "serif", label = c("")) +
  coord_flip() +
  labs(x = "Car Brands", y = "CO2 Emissions") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.x = element_blank()
  )
The same trend is repeated for CO2 emissions: the luxury brands produce the highest CO2 emissions compared to other brands.
And according to countries of origin, how is the engine capacity distributed?
# Boxplots of CO2 against engine capacity, one panel per country of origin.
# NOTE(review): engine_capacity looks continuous and no `group` aesthetic is
# set, so ggplot2 presumably draws a single box per panel. Confirm this is the
# intended figure.
ggplot(data_country, aes(x = engine_capacity, y = co2)) +
  geom_boxplot(
    position = position_dodge(),
    fill = main2_color,
    colour = decoration_color
  ) +
  facet_wrap(~ country_origin) +
  labs(x = "", y = "") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10),
    axis.title.x = element_blank()
  )
Italy appears to have the highest-emitting cars, while France and Spain produce cars with lower engine capacities and medium emissions. Germany and England seem to have the same distributions.
# Scatter of CO2 emissions against euro standard, built first so that marginal
# density plots can be attached to it afterwards.
pp <- ggplot(data_classif, aes(x = euro_standard, y = co2)) +
  geom_point(color = main2_color, size = 0.02, alpha = 0.09) +
  labs(x = "Eurostandards", y = "CO2 Emissions") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )

# Add filled marginal densities with no outline.
ggMarginal(
  pp,
  type = "density",
  fill = main2_color,
  color = "transparent",
  alpha = 1
)
Indeed, as CO2 emissions decrease, the Eurostandard of the car increases.
# Bubble chart: transport share of CO2 emissions against GDP per capita on a
# log-scaled x axis. Point size follows total population, colour the continent.
# NOTE(review): this repeats the GDP-per-capita bubble chart drawn at the start
# of the study; confirm the repetition is intended.
ggplot(
  data_worldb,
  aes(
    x = gdp_per_capita,
    y = co2_fromtransport_percent_total,
    size = pop_total,
    colour = continent
  )
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_size(range = c(2, 10)) +
  scale_color_manual(
    values = c("#003f5c", "#665191", "#d45087", "#ff7c43", "#ffa600", "#1ce3cd")
  ) +
  labs(
    x = "GDP per Capita (log scale)",
    y = "CO2 emissions from transportation (%)"
  ) +
  theme(
    legend.position = "right",
    axis.text = element_text(size = 6),
    axis.title = element_text(size = 8)
  )
# Plain scatter of CO2 emissions against engine capacity for the filtered fuel
# types, using transparent points in a single colour.
ggplot(data_filtered_fuel_type, aes(x = engine_capacity, y = co2)) +
  geom_point(alpha = 0.1, colour = main2_color) +
  labs(x = "Engine capacity", y = "CO2 emissions") +
  theme(
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 8),
    strip.text = element_text(size = 10)
  )